{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models import Word2Vec, KeyedVectors\n",
    "\n",
    "from entity.Paragraph import Paragraph\n",
    "from entity.Post import Post\n",
    "from formula import relevance\n",
    "from preprocessor.preprocessing import PreprocessPostContent\n",
    "from utils import idf, search_es, mmr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Config: input artifacts, given as paths relative to the working directory\n",
    "W2V_MODEL_PATH = \"w2v_model.bin\"\n",
    "IDF_CSV_PATH = \"idf.csv\"\n",
    "\n",
    "# load word2vec model (file is read in binary word2vec format)\n",
    "model = KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, binary=True)\n",
    "# load idf_dict\n",
    "idf_dict = idf.from_csv(file_name=IDF_CSV_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hash tables become quite inefficient when there are many collisions While extremely uneven hash distributions are extremely unlikely to arise by chance malicious adversary with knowledge of the hash function may be able to supply information to hash that creates worst-case behavior by causing excessive collisions resulting in very poor performance e.g. denial of service attack 21 In critical applications universal hashing can be used data structure with better worst-case guarantees may be preferable 22\n",
      "I understand that the number was determined by empirical testing rather than by theoretical analysis thorough theoretical analysis would be difficult However load factor of NUM combined with good hash function is probably sufficient to keep hash chains down to NUM or NUM in the vast majority of cases and that results in fast average lookup times\n",
      "I think you get mixed up between hash collision and key collision Put it simply hash table consist of collection of linked lists When you add new key value pairs it is distributed into the buckets by the key's hash value\n",
      "HashTable has been deprecated long time ago The main difference is that HashTable is synchronized internally while HashMap is not This is however seen as disadvantage because using the HashTable on single thread incurs the penalty of locking and unlocking when it is not needed\n",
      "By design hashtable is not meant to store duplicate keys\n"
     ]
    }
   ],
   "source": [
    "# give a query string and search from es; `size` caps the result set at 200 posts\n",
    "query = \"Differences of HashMap HashTable in Java?\"\n",
    "posts = search_es.search(title=query, body=query, size=200)\n",
    "# create post_obj_list: one Post per hit, built from its tokenized title and its answers\n",
    "preprocessor = PreprocessPostContent()\n",
    "post_obj_list = [\n",
    "    Post(preprocessor.get_single_para_word_list(post['Title']), post['answers'])\n",
    "    for post in posts\n",
    "]\n",
    "\n",
    "# calculate post score and sort in reverse order\n",
    "query_word_list = preprocessor.get_single_para_word_list(query)\n",
    "for post_obj in post_obj_list:\n",
    "    post_obj.calculate_score(model, query_word_list, idf_dict)\n",
    "post_obj_list.sort(reverse=True)\n",
    "# select top 5 posts and create paragraph_obj_list; slicing instead of indexing\n",
    "# range(5) avoids an IndexError when the search returns fewer than 5 posts\n",
    "paragraph_obj_list = []\n",
    "for post_obj in post_obj_list[:5]:\n",
    "    for answer in post_obj.answer_list:\n",
    "        paragraph_list = preprocessor.getParagraphs(answer['Body'])\n",
    "        # position is 1-based within the answer\n",
    "        for pos, para in enumerate(paragraph_list, 1):\n",
    "            paragraph_obj_list.append(\n",
    "                Paragraph(para, preprocessor.get_single_para_word_list(para), vote_score=answer['Score'], position=pos))\n",
    "\n",
    "# calculate the per-paragraph scores and collect the values needed for normalization\n",
    "relevance_list = []\n",
    "entropy_list = []\n",
    "vote_score_list = []\n",
    "for para_obj in paragraph_obj_list:\n",
    "    para_obj.cal_relevance(model, query_word_list, idf_dict)\n",
    "    para_obj.cal_entropy(idf_dict)\n",
    "    para_obj.cal_semantic_pattern()\n",
    "    para_obj.cal_format_pattern()\n",
    "    para_obj.cal_pos_score()\n",
    "    relevance_list.append(para_obj.relevance_score)\n",
    "    entropy_list.append(para_obj.infor_entropy)\n",
    "    vote_score_list.append(para_obj.vote_score)\n",
    "\n",
    "# The min/max bounds do not change inside the loop, so compute them once instead of\n",
    "# once per paragraph. Guarded because min()/max() raise ValueError on an empty list.\n",
    "if paragraph_obj_list:\n",
    "    relevance_min, relevance_max = min(relevance_list), max(relevance_list)\n",
    "    entropy_min, entropy_max = min(entropy_list), max(entropy_list)\n",
    "    vote_min, vote_max = min(vote_score_list), max(vote_score_list)\n",
    "    for para_obj in paragraph_obj_list:\n",
    "        para_obj.normalized(relevance_min, relevance_max, entropy_min, entropy_max,\n",
    "                            vote_min, vote_max)\n",
    "        para_obj.cal_overall_score()\n",
    "\n",
    "paragraph_obj_list.sort(reverse=True)\n",
    "# select top n paragraph and cal mmr; n is clamped so it never exceeds the number of\n",
    "# paragraphs actually available\n",
    "# NOTE(review): assumes cal_mmr expects n to match len(paragraph_obj_list) - confirm\n",
    "n = min(10, len(paragraph_obj_list))\n",
    "paragraph_obj_list = paragraph_obj_list[0:n]\n",
    "element_obj_list = mmr.cal_mmr(n, paragraph_obj_list, model, idf_dict)\n",
    "element_obj_list.sort()\n",
    "# get top k mmr index\n",
    "k = 5\n",
    "mmr_index_set = mmr.get_top_k_position(k, element_obj_list)\n",
    "# loop paragraph_obj_list[mmr_index] and print each selected paragraph's words\n",
    "for i in mmr_index_set:\n",
    "    print(\" \".join(paragraph_obj_list[i].word_list))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
